# Notebook styling: inject the CSS classes used by the markdown cells below.
# FIX: IPython.core.display is a deprecated internal path; HTML / Javascript /
# display are exposed by the public IPython.display module.
from IPython.display import display, HTML, Javascript
# CSS styling for markdown
styling = """
<style>
.main-heading{
background-color: #292929;
color: #BACFFE;
font-family: Helvetica;
font-size: 32px !important;
padding: 12px 12px;
margin-bottom: 5px;
border-radius: 4px;
box-shadow: rgba(0, 0, 0, 0.19) 0px 10px 20px, rgba(0, 0, 0, 0.23) 0px 6px 6px;
}
.table-heading{
color: #292929;
font-family: Helvetica;
}
.sub-heading{
width: auto !important;
background-color: #292929;
color: #BACFFE;
font-family: Helvetica;
font-size: 24px !important;
padding: 10px 12px;
margin-bottom: 3px;
box-shadow: rgba(0, 0, 0, 0.16) 0px 3px 6px, rgba(0, 0, 0, 0.23) 0px 3px 6px;
}
.default-font-color{
color: rgba(0,0,0,0.7) !important;
}
.highlight-gray {
background: #292929;
color: white;
padding: 1px 3px;
}
.highlight-babyblue{
background: #707D99;
color: white;
padding: 1px 3px;
}
</style>
"""
# render the style block so the classes apply to subsequent markdown cells
HTML(styling)
Our data is a combination of three different datasets (Data Analyst jobs, Data Engineer jobs, Data Scientist jobs). Data is one of the most exciting fields right now: because companies hold a treasure trove of data, new jobs such as data scientist, data engineer, and data analyst have appeared in the market to extract insights from that data.
We decided to help candidates hunt for new jobs and learn more about the US market through our data-driven story.
#import libraries
import numpy as np
import pandas as pd
import plotly
import plotly.express as px
import plotly.graph_objs as go
from geopy.geocoders import Nominatim
from tqdm.notebook import trange, tqdm
from geopy import geocoders
from geopy.exc import GeocoderTimedOut
#read the datasets (one Glassdoor scrape per job family)
df_analyst = pd.read_csv('DataAnalyst.csv')
df_engineer = pd.read_csv('DataEngineer.csv')
df_scientist = pd.read_csv('DataScientist.csv')
# Overwrite the scraped titles with a normalized job-family label so the three
# frames remain distinguishable after concatenation.
df_analyst['Job Title']='Data Analyst'
df_engineer['Job Title']='Data Engineer'
df_scientist['Job Title']='Data Scientist'
#combine all datasets (sort=True aligns possibly-different column orders)
df = pd.concat([df_analyst, df_engineer, df_scientist],sort=True)
df.head(2)
| Company Name | Competitors | Easy Apply | Founded | Headquarters | Industry | Job Description | Job Title | Location | Rating | Revenue | Salary Estimate | Sector | Size | Type of ownership | Unnamed: 0 | index | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Vera Institute of Justice\n3.2 | -1 | True | 1961 | New York, NY | Social Assistance | Are you eager to roll up your sleeves and harn... | Data Analyst | New York, NY | 3.2 | $100 to $500 million (USD) | $37K-$66K (Glassdoor est.) | Non-Profit | 201 to 500 employees | Nonprofit Organization | 0.0 | NaN |
| 1 | Visiting Nurse Service of New York\n3.8 | -1 | -1 | 1893 | New York, NY | Health Care Services & Hospitals | Overview\n\nProvides analytical and technical ... | Data Analyst | New York, NY | 3.8 | $2 to $5 billion (USD) | $37K-$66K (Glassdoor est.) | Health Care | 10000+ employees | Nonprofit Organization | 1.0 | NaN |
df.tail(2)
| Company Name | Competitors | Easy Apply | Founded | Headquarters | Industry | Job Description | Job Title | Location | Rating | Revenue | Salary Estimate | Sector | Size | Type of ownership | Unnamed: 0 | index | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 3907 | Data Resource Technologies\n4.0 | -1 | -1 | -1 | Omaha, NE | Accounting | The Security Analytics Data Engineer will inte... | Data Scientist | Dublin, OH | 4.0 | Less than $1 million (USD) | $55K-$112K (Glassdoor est.) | Accounting & Legal | 1 to 50 employees | Company - Private | 3907.0 | 4378.0 |
| 3908 | UCB\n3.7 | AbbVie | -1 | -1 | Brussel, Belgium | Biotech & Pharmaceuticals | Help us transform patients' lives.\nAt UCB, we... | Data Scientist | Slough, OH | 3.7 | $2 to $5 billion (USD) | $55K-$112K (Glassdoor est.) | Biotech & Pharmaceuticals | 5001 to 10000 employees | Company - Public | 3908.0 | 4379.0 |
#check nulls: count missing values per column
df.isnull().sum()
Company Name 1 Competitors 0 Easy Apply 0 Founded 0 Headquarters 0 Industry 0 Job Description 0 Job Title 0 Location 0 Rating 0 Revenue 0 Salary Estimate 0 Sector 0 Size 0 Type of ownership 0 Unnamed: 0 2528 index 4781 dtype: int64
We have a lot of missing values in 'index' / 'Unnamed: 0'.
We also need to treat -1 as a missing value.
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 8690 entries, 0 to 3908 Data columns (total 17 columns): Company Name 8689 non-null object Competitors 8690 non-null object Easy Apply 8690 non-null object Founded 8690 non-null int64 Headquarters 8690 non-null object Industry 8690 non-null object Job Description 8690 non-null object Job Title 8690 non-null object Location 8690 non-null object Rating 8690 non-null float64 Revenue 8690 non-null object Salary Estimate 8690 non-null object Sector 8690 non-null object Size 8690 non-null object Type of ownership 8690 non-null object Unnamed: 0 6162 non-null float64 index 3909 non-null float64 dtypes: float64(3), int64(1), object(13) memory usage: 1.2+ MB
'Revenue' / 'Salary Estimate' / 'Size' are not numeric and need to be cleaned.
We don't need Headquarters, Founded, Easy Apply, Competitors, or Type of ownership in our case, so we remove them and just use Location.
df.duplicated().sum()
14
df.Location.value_counts ()
New York, NY 800
Chicago, IL 652
Austin, TX 625
San Diego, CA 561
Houston, TX 490
San Antonio, TX 412
Philadelphia, PA 401
Los Angeles, CA 392
Dallas, TX 348
Phoenix, AZ 317
Irving, TX 259
Jacksonville, FL 209
San Jose, CA 183
Columbus, OH 181
Fort Worth, TX 151
Plano, TX 132
San Francisco, CA 120
Santa Clara, CA 116
Tempe, AZ 111
Sunnyvale, CA 92
Charlotte, NC 78
Mountain View, CA 78
Scottsdale, AZ 76
Palo Alto, CA 62
Burbank, CA 58
Menlo Park, CA 55
Redwood City, CA 53
Chandler, AZ 53
El Segundo, CA 52
Denver, CO 47
...
Mount Vernon, NY 1
Naperville, IL 1
Beech Grove, IN 1
Woodbridge, NJ 1
Smithfield, VA 1
Indian Trail, NC 1
Jeffersonville, IN 1
Spring Valley, CA 1
Lockbourne, OH 1
Montvale, NJ 1
Great Neck, NY 1
Deer Park, TX 1
Whitestown, IN 1
Essex Fells, NJ 1
Signal Hill, CA 1
Alameda, CA 1
Carle Place, NY 1
Newtown Square, PA 1
Missouri City, TX 1
Middletown, NJ 1
Virginia Beach, VA 1
Luke AFB, AZ 1
Millbrae, CA 1
Maywood, NJ 1
Encino, CA 1
Burlingame, KS 1
Lemont, IL 1
Yorktown, VA 1
Woodcliff Lake, NJ 1
Nassau Bay, TX 1
Name: Location, Length: 334, dtype: int64
df['Salary Estimate'].value_counts ()
$60K-$124K (Glassdoor est.) 62
$93K-$151K (Glassdoor est.) 62
$60K-$109K (Glassdoor est.) 62
$94K-$166K (Glassdoor est.) 62
$113K-$180K (Glassdoor est.) 61
$43K-$76K (Glassdoor est.) 61
$44K-$80K (Glassdoor est.) 60
$71K-$122K (Glassdoor est.) 60
$111K-$175K (Glassdoor est.) 60
$76K-$122K (Glassdoor est.) 58
$65K-$120K (Glassdoor est.) 58
$38K-$68K (Glassdoor est.) 57
$41K-$78K (Glassdoor est.) 57
$42K-$76K (Glassdoor est.) 57
$55K-$101K (Glassdoor est.) 55
$74K-$123K (Glassdoor est.) 47
$75K-$135K (Glassdoor est.) 44
$119K-$147K (Glassdoor est.) 44
$82K-$130K(Employer est.) 42
$50K-$86K (Glassdoor est.) 41
$100K-$172K (Glassdoor est.) 37
$35K-$67K (Glassdoor est.) 33
$164K-$192K (Glassdoor est.) 32
$79K-$96K (Glassdoor est.) 32
$105K-$121K (Glassdoor est.) 32
$81K-$141K (Glassdoor est.) 32
$54K-$103K (Glassdoor est.) 32
$95K-$99K (Glassdoor est.) 32
$87K-$104K (Glassdoor est.) 32
$84K-$113K (Glassdoor est.) 32
..
$92K-$117K (Glassdoor est.) 10
$81K-$100K (Glassdoor est.) 10
$40K-$74K (Glassdoor est.) 9
$66K-$130K (Glassdoor est.) 9
$47K-$82K (Glassdoor est.) 9
$93K-$103K (Glassdoor est.) 9
$55K-$98K (Glassdoor est.) 9
$66K-$122K (Glassdoor est.) 9
$118K-$187K (Glassdoor est.) 8
$70K-$138K (Glassdoor est.) 8
$30K-$54K (Glassdoor est.) 8
$110K-$169K (Glassdoor est.) 8
$34-$53 Per Hour(Glassdoor est.) 7
$114K-$185K (Glassdoor est.) 7
$39K-$71K (Glassdoor est.) 7
$57K-$92K (Glassdoor est.) 7
$58K-$94K (Glassdoor est.) 7
$99K-$173K (Glassdoor est.) 6
$57K-$97K (Glassdoor est.) 5
$32K-$56K (Glassdoor est.) 4
$31K-$59K (Glassdoor est.) 4
$42K-$63K (Glassdoor est.) 4
$10-$26 Per Hour(Glassdoor est.) 3
$146K-$175K (Glassdoor est.) 3
$36K-$67K (Glassdoor est.) 3
$47K-$81K (Glassdoor est.) 3
$43K-$77K (Glassdoor est.) 3
$57K-$70K (Glassdoor est.) 2
$41K-$74K (Glassdoor est.) 1
-1 1
Name: Salary Estimate, Length: 318, dtype: int64
df['Sector'].value_counts ()
Information Technology 2584 Business Services 1619 -1 1274 Finance 616 Health Care 399 Biotech & Pharmaceuticals 361 Insurance 242 Manufacturing 220 Education 181 Media 169 Aerospace & Defense 153 Retail 146 Government 142 Accounting & Legal 141 Oil, Gas, Energy & Utilities 84 Telecommunications 64 Transportation & Logistics 60 Consumer Services 47 Non-Profit 46 Real Estate 40 Construction, Repair & Maintenance 39 Restaurants, Bars & Food Services 24 Arts, Entertainment & Recreation 22 Travel & Tourism 9 Agriculture & Forestry 6 Mining & Metals 2 Name: Sector, dtype: int64
df['Revenue'].value_counts ()
Unknown / Non-Applicable 2492 $10+ billion (USD) 1181 $100 to $500 million (USD) 830 $50 to $100 million (USD) 563 -1 511 $2 to $5 billion (USD) 475 $10 to $25 million (USD) 454 $1 to $5 million (USD) 385 $25 to $50 million (USD) 373 $1 to $2 billion (USD) 344 Less than $1 million (USD) 299 $5 to $10 billion (USD) 287 $500 million to $1 billion (USD) 261 $5 to $10 million (USD) 235 Name: Revenue, dtype: int64
# remove columns we will not use in this analysis
df = df.drop(['Unnamed: 0', 'index', 'Headquarters', 'Type of ownership',
              'Competitors', 'Easy Apply', 'Founded', 'Industry'], axis=1)
# Glassdoor encodes missing values as -1 ('-1' in object columns, -1 / -1.0 in
# numeric ones).  FIX: replace whole-cell matches only -- the original
# regex=True substring replace would also corrupt any '-1' occurring inside
# free text such as Job Description.
df = df.replace('-1', np.nan)  # object columns (exact full-cell match)
df = df.replace(-1, np.nan)    # numeric columns (-1 == -1.0, covers both)
# drop rows with missing data (this also removes the rows that had no usable
# salary estimate, which the original filtered explicitly)
df = df.dropna()
# remove duplicate rows
df = df.drop_duplicates()
df = df.reset_index(drop=True)
| Company Name | Job Description | Job Title | Location | Rating | Revenue | Salary Estimate | Sector | Size | |
|---|---|---|---|---|---|---|---|---|---|
| 6604 | Moveworks\n5.0 | As an AI Insights Data Analyst at Moveworks, y... | Data Scientist | Mountain View, CA | 5.0 | Unknown / Non-Applicable | $141K-$225K (Glassdoor est.) | Information Technology | 51 to 200 employees |
| 409 | Staffigo Technical Services, LLC\n5.0 | Job Description\nJob description\nInterpret da... | Data Analyst | Gainesville, FL | 5.0 | $50 to $100 million (USD) | $30K-$54K (Glassdoor est.) | Information Technology | 51 to 200 employees |
| 6366 | BetterHelp\n4.4 | Who are we and why join us?\nWe help people ch... | Data Scientist | Mountain View, CA | 4.4 | $50 to $100 million (USD) | $111K-$175K (Glassdoor est.) | Information Technology | 51 to 200 employees |
| 2327 | Tempus Labs\n3.2 | Passionate about precision medicine and advanc... | Data Engineer | Chicago, IL | 3.2 | Unknown / Non-Applicable | $53K-$100K (Glassdoor est.) | Biotech & Pharmaceuticals | 501 to 1000 employees |
| 4795 | Tempus Labs\n3.2 | Passionate about precision medicine and advanc... | Data Scientist | Chicago, IL | 3.2 | Unknown / Non-Applicable | $60K-$105K (Glassdoor est.) | Biotech & Pharmaceuticals | 501 to 1000 employees |
| 4255 | Zest AI\n3.7 | Data Scientist's at Zest AI use the power of m... | Data Scientist | Burbank, CA | 3.7 | $50 to $100 million (USD) | $113K-$180K (Glassdoor est.) | Finance | 51 to 200 employees |
| 2807 | Amerihealth\n3.7 | Your career starts now. Were looking for the n... | Data Engineer | Philadelphia, PA | 3.7 | $25 to $50 million (USD) | $71K-$130K(Employer est.) | Insurance | 5001 to 10000 employees |
| 3611 | Kazoo\n4.0 | Overview of the role:\n\nWe’re looking for a f... | Data Engineer | Austin, TX | 4.0 | Unknown / Non-Applicable | $52K-$85K (Glassdoor est.) | Information Technology | 51 to 200 employees |
| 7235 | Tachyon Technologies\n4.4 | About Us\n\nTachyon Technologies is a Digital ... | Data Scientist | Dublin, OH | 4.4 | $10 to $25 million (USD) | $55K-$112K (Glassdoor est.) | Information Technology | 201 to 500 employees |
| 6492 | Apple\n4.1 | Posted: May 23, 2019\nRole Number:\n200063205\... | Data Scientist | Santa Clara, CA | 4.1 | $10+ billion (USD) | $119K-$188K (Glassdoor est.) | Information Technology | 10000+ employees |
hours_per_week = 40
weeks_per_year = 52

def _parse_salary_estimate(estimate, hours_per_week=40, weeks_per_year=52):
    """Parse one Glassdoor salary string into (lower, upper) bounds in $K/year.

    Handles the two formats present in the data:
      '$37K-$66K (Glassdoor est.)'       -> (37, 66)
      '$34-$53 Per Hour(Glassdoor est.)' -> hourly rate annualized into $K
    """
    estimate = estimate.replace("$", "")
    lower, upper = estimate.split("-")
    if "Per Hour" in estimate:
        upper = upper.split("Per")[0].strip()
        # hourly -> annual, expressed in thousands of dollars
        to_annual_k = hours_per_week * weeks_per_year / 1000
        return int(lower) * to_annual_k, int(upper) * to_annual_k
    lower = lower.replace("K", "")
    upper = upper.split("(")[0].replace("K", "").strip()
    return int(lower), int(upper)

# Row-by-row assignment (as in the original) so the new columns get the same
# dtype behavior; relies on the reset index being 0..n-1.
for i in range(df.shape[0]):
    lower, upper = _parse_salary_estimate(
        df.loc[i, "Salary Estimate"], hours_per_week, weeks_per_year)
    df.loc[i, "salary lower bound"] = lower
    df.loc[i, "salary upper bound"] = upper
df = df.drop('Salary Estimate', axis=1)
df['avg salary'] = (df['salary lower bound'] + df['salary upper bound']) / 2
df.head()
| Company Name | Job Description | Job Title | Location | Rating | Revenue | Sector | Size | salary lower bound | salary upper bound | avg salary | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Vera Institute of Justice\n3.2 | Are you eager to roll up your sleeves and harn... | Data Analyst | New York, NY | 3.2 | $100 to $500 million (USD) | Non-Profit | 201 to 500 employees | 37.0 | 66.0 | 51.5 |
| 1 | Visiting Nurse Service of New York\n3.8 | Overview\n\nProvides analytical and technical ... | Data Analyst | New York, NY | 3.8 | $2 to $5 billion (USD) | Health Care | 10000+ employees | 37.0 | 66.0 | 51.5 |
| 2 | Squarespace\n3.4 | We’re looking for a Senior Data Analyst who ha... | Data Analyst | New York, NY | 3.4 | Unknown / Non-Applicable | Information Technology | 1001 to 5000 employees | 37.0 | 66.0 | 51.5 |
| 3 | Celerity\n4.1 | Requisition NumberRR-0001939\nRemote:Yes\nWe c... | Data Analyst | New York, NY | 4.1 | $50 to $100 million (USD) | Information Technology | 201 to 500 employees | 37.0 | 66.0 | 51.5 |
| 4 | FanDuel\n3.9 | ABOUT FANDUEL GROUP\n\nFanDuel Group is a worl... | Data Analyst | New York, NY | 3.9 | $100 to $500 million (USD) | Arts, Entertainment & Recreation | 501 to 1000 employees | 37.0 | 66.0 | 51.5 |
# Company Name cells embed the rating after a newline (e.g. "Vera...\n3.2");
# keep only the text before the first newline.
df["Company Name"] = df["Company Name"].str.split("\n").str[0]
df["Company Name"].head()
0 Vera Institute of Justice 1 Visiting Nurse Service of New York 2 Squarespace 3 Celerity 4 FanDuel Name: Company Name, dtype: object
# Split "City, ST" into separate columns.  The third column ("None") catches
# the spill-over from "City, County, ST" locations -- assumes no location has
# more than three comma-separated parts.  Note the split keeps leading spaces
# (e.g. State is " NY"), which is why later comparisons include them.
df[["City", "State", "None"]] = pd.DataFrame(df.Location.str.split(",", expand = True))
df.head(3)
| Company Name | Job Description | Job Title | Location | Rating | Revenue | Sector | Size | salary lower bound | salary upper bound | avg salary | City | State | None | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Vera Institute of Justice | Are you eager to roll up your sleeves and harn... | Data Analyst | New York, NY | 3.2 | $100 to $500 million (USD) | Non-Profit | 201 to 500 employees | 37.0 | 66.0 | 51.5 | New York | NY | None |
| 1 | Visiting Nurse Service of New York | Overview\n\nProvides analytical and technical ... | Data Analyst | New York, NY | 3.8 | $2 to $5 billion (USD) | Health Care | 10000+ employees | 37.0 | 66.0 | 51.5 | New York | NY | None |
| 2 | Squarespace | We’re looking for a Senior Data Analyst who ha... | Data Analyst | New York, NY | 3.4 | Unknown / Non-Applicable | Information Technology | 1001 to 5000 employees | 37.0 | 66.0 | 51.5 | New York | NY | None |
df['None'].value_counts ()
CO 7 Name: None, dtype: int64
df['State'].value_counts ()
TX 2206 CA 1915 NY 679 IL 618 PA 520 AZ 497 OH 181 FL 181 NJ 113 CO 79 NC 74 WA 43 VA 41 DE 29 UT 23 IN 22 Arapahoe 7 United Kingdom 4 GA 4 KS 2 SC 2 Name: State, dtype: int64
# inspect the rows where a county name ("Arapahoe") landed in the State column
county_rows = df["State"].str.contains("Arapahoe")
df[county_rows]
| Company Name | Job Description | Job Title | Location | Rating | Revenue | Sector | Size | salary lower bound | salary upper bound | avg salary | City | State | None | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1777 | Collabera | Greenwood Village, Colorado\nSkills : sql\nDes... | Data Analyst | Greenwood Village, Arapahoe, CO | 4.1 | $500 million to $1 billion (USD) | Information Technology | 10000+ employees | 57.0 | 67.0 | 62.0 | Greenwood Village | Arapahoe | CO |
| 1787 | Apex Systems | Job Description\n\n\nJob #: 1067357\n\nApex Sy... | Data Analyst | Greenwood Village, Arapahoe, CO | 3.8 | $2 to $5 billion (USD) | Business Services | 1001 to 5000 employees | 57.0 | 67.0 | 62.0 | Greenwood Village | Arapahoe | CO |
| 1795 | Collabera | Greenwood Village, Colorado\nSkills : SQL,ETL,... | Data Analyst | Greenwood Village, Arapahoe, CO | 4.1 | $500 million to $1 billion (USD) | Information Technology | 10000+ employees | 49.0 | 91.0 | 70.0 | Greenwood Village | Arapahoe | CO |
| 1803 | Century Communities | Working at Century Communities means building ... | Data Analyst | Greenwood Village, Arapahoe, CO | 3.5 | $500 million to $1 billion (USD) | Construction, Repair & Maintenance | 501 to 1000 employees | 49.0 | 91.0 | 70.0 | Greenwood Village | Arapahoe | CO |
| 1832 | Apex Systems | Job Description\n\n\nJob #: 1068993\n\nApex Sy... | Data Analyst | Greenwood Village, Arapahoe, CO | 3.8 | $2 to $5 billion (USD) | Business Services | 1001 to 5000 employees | 57.0 | 100.0 | 78.5 | Greenwood Village | Arapahoe | CO |
| 1838 | Telligen | Senior Health Data Analyst (SAS)\n\nTelligen i... | Data Analyst | Greenwood Village, Arapahoe, CO | 3.6 | $50 to $100 million (USD) | Health Care | 501 to 1000 employees | 57.0 | 100.0 | 78.5 | Greenwood Village | Arapahoe | CO |
| 1839 | ClientSolv Technologies | Company Description\n\nClientSolv Technologies... | Data Analyst | Greenwood Village, Arapahoe, CO | 3.3 | $10 to $25 million (USD) | Information Technology | 51 to 200 employees | 57.0 | 100.0 | 78.5 | Greenwood Village | Arapahoe | CO |
# Rectify: for "City, County, CO" locations the real state code spilled into
# the "None" column while State captured the county; map the county back to CO.
df["State"] = df["State"].str.replace("Arapahoe", "CO")
# non-US rows are out of scope for this story (leading space comes from the
# comma split above)
df = df[df["State"] != " United Kingdom"]
# the spill-over column has served its purpose
df = df.drop(columns=["None"])
df.sample(5)
| Company Name | Job Description | Job Title | Location | Rating | Revenue | Sector | Size | salary lower bound | salary upper bound | avg salary | City | State | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1541 | Vionic Group | Digital Marketing & eCommerce Data Analyst\n\n... | Data Analyst | San Rafael, CA | 3.6 | $100 to $500 million (USD) | Retail | 51 to 200 employees | 57.0 | 104.0 | 80.5 | San Rafael | CA |
| 5218 | Discount Tire | Job Description\n\n\nHere at Discount Tire, we... | Data Scientist | Scottsdale, AZ | 4.0 | $2 to $5 billion (USD) | Retail | 10000+ employees | 84.0 | 156.0 | 120.0 | Scottsdale | AZ |
| 6928 | Molecular Templates Inc | Company Description:\n\nMolecular Templates (M... | Data Scientist | Austin, TX | 4.0 | Less than $1 million (USD) | Biotech & Pharmaceuticals | 1 to 50 employees | 62.0 | 112.0 | 87.0 | Austin | TX |
| 308 | Riskified | About Us\n\n\nRiskified is the AI platform pow... | Data Analyst | New York, NY | 4.4 | Unknown / Non-Applicable | Business Services | 501 to 1000 employees | 98.0 | 114.0 | 106.0 | New York | NY |
| 6848 | Staffigo Technical Services, LLC | Job Description\nJob description\nInterpret da... | Data Scientist | Austin, TX | 5.0 | $50 to $100 million (USD) | Information Technology | 51 to 200 employees | 74.0 | 144.0 | 109.0 | Austin | TX |
def max_revenue(x):
    """Return the upper bound of a Glassdoor revenue bracket, in $ millions.

    Accepted inputs (the 'Revenue' column values):
      '$100 to $500 million (USD)'   -> 500.0
      '$2 to $5 billion (USD)'       -> 5000.0
      '$10+ billion (USD)'           -> 10000.0  (open-ended: stated floor)
      'Less than $1 million (USD)'   -> 1.0      (single stated bound)
      'Unknown / Non-Applicable'     -> 0
    NaN (a float) also maps to 0 so unknown-revenue rows can be filtered out.
    """
    # NaN survives earlier cleaning as a float; treat it like "unknown".
    # (isinstance instead of the original type(x)==float anti-pattern)
    if isinstance(x, float) or x == 'Unknown / Non-Applicable':
        return 0
    # Brackets quoted in billions are converted to millions (x1000).
    scale = 1000.0 if 'billion' in x else 1.0
    cleaned = (x.replace('(USD)', '')
                .replace('billion', '')
                .replace('million', '')
                .replace('$', '')
                .strip())
    parts = cleaned.split('to')
    if 'Less than' in parts[0]:
        # 'Less than $1 million' -> only one bound is given
        return float(parts[0].replace('Less than', '').strip()) * scale
    if '+' in parts[0]:
        # '$10+ billion' -> open-ended range; use the stated floor
        return float(parts[0].replace('+', '').strip()) * scale
    # 'a to b' -> upper bound; a single value -> that value
    return float(parts[-1]) * scale
#get maximum revenue (upper bound of the Revenue bracket, in $ millions)
df['Max revenue']=df['Revenue'].apply(lambda x: max_revenue(x))
# the raw bracket string is no longer needed
df = df.drop(['Revenue'], axis=1)
df.head()
| Company Name | Job Description | Job Title | Location | Rating | Sector | Size | salary lower bound | salary upper bound | avg salary | City | State | Max revenue | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Vera Institute of Justice | Are you eager to roll up your sleeves and harn... | Data Analyst | New York, NY | 3.2 | Non-Profit | 201 to 500 employees | 37.0 | 66.0 | 51.5 | New York | NY | 500.0 |
| 1 | Visiting Nurse Service of New York | Overview\n\nProvides analytical and technical ... | Data Analyst | New York, NY | 3.8 | Health Care | 10000+ employees | 37.0 | 66.0 | 51.5 | New York | NY | 5000.0 |
| 2 | Squarespace | We’re looking for a Senior Data Analyst who ha... | Data Analyst | New York, NY | 3.4 | Information Technology | 1001 to 5000 employees | 37.0 | 66.0 | 51.5 | New York | NY | 0.0 |
| 3 | Celerity | Requisition NumberRR-0001939\nRemote:Yes\nWe c... | Data Analyst | New York, NY | 4.1 | Information Technology | 201 to 500 employees | 37.0 | 66.0 | 51.5 | New York | NY | 100.0 |
| 4 | FanDuel | ABOUT FANDUEL GROUP\n\nFanDuel Group is a worl... | Data Analyst | New York, NY | 3.9 | Arts, Entertainment & Recreation | 501 to 1000 employees | 37.0 | 66.0 | 51.5 | New York | NY | 500.0 |
# drop companies whose revenue is unknown (max_revenue mapped those to 0)
df = df[df['Max revenue'] != 0.0]
df.sample(5)
| Company Name | Job Description | Job Title | Location | Rating | Sector | Size | salary lower bound | salary upper bound | avg salary | City | State | Max revenue | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 4173 | Amazon | Excited by using massive amounts of data to de... | Data Scientist | New York, NY | 3.9 | Information Technology | 10000+ employees | 110.0 | 139.0 | 124.5 | New York | NY | 10000.0 |
| 4897 | GeoDigital | Shift Type\n\nJob Title\n\nData Analyst (LiDar... | Data Scientist | Houston, TX | 2.4 | Information Technology | 51 to 200 employees | 36.0 | 71.0 | 53.5 | Houston | TX | 50.0 |
| 7175 | J.P. Morgan | The Cybersecurity & Technology Controls group ... | Data Scientist | Columbus, OH | 3.9 | Finance | 10000+ employees | 55.0 | 113.0 | 84.0 | Columbus | OH | 10000.0 |
| 2459 | Bergaila | OverviewWe are currently seeking an experience... | Data Engineer | Houston, TX | 3.0 | Oil, Gas, Energy & Utilities | 201 to 500 employees | 54.0 | 113.0 | 83.5 | Houston | TX | 500.0 |
| 3736 | iVenture Solutions, Inc. | Our Network Engineers build and maintain a tac... | Data Engineer | Jacksonville, FL | 4.2 | Information Technology | 51 to 200 employees | 39.0 | 77.0 | 58.0 | Jacksonville | FL | 25.0 |
# Cast columns to their proper dtypes in a single pass:
#  - categorical: Job Title, Sector
#  - int64: Rating (ordinal; truncates the float ratings), both salary
#    bounds, avg salary, and Max revenue
df = df.astype({
    'Job Title': 'category',
    'Rating': 'int64',
    'Sector': 'category',
    'salary lower bound': 'int64',
    'salary upper bound': 'int64',
    'avg salary': 'int64',
    'Max revenue': 'int64',
})
df.head()
| Company Name | Job Description | Job Title | Location | Rating | Sector | Size | salary lower bound | salary upper bound | avg salary | City | State | Max revenue | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Vera Institute of Justice | Are you eager to roll up your sleeves and harn... | Data Analyst | New York, NY | 3 | Non-Profit | 201 to 500 employees | 37 | 66 | 51 | New York | NY | 500 |
| 1 | Visiting Nurse Service of New York | Overview\n\nProvides analytical and technical ... | Data Analyst | New York, NY | 3 | Health Care | 10000+ employees | 37 | 66 | 51 | New York | NY | 5000 |
| 3 | Celerity | Requisition NumberRR-0001939\nRemote:Yes\nWe c... | Data Analyst | New York, NY | 4 | Information Technology | 201 to 500 employees | 37 | 66 | 51 | New York | NY | 100 |
| 4 | FanDuel | ABOUT FANDUEL GROUP\n\nFanDuel Group is a worl... | Data Analyst | New York, NY | 3 | Arts, Entertainment & Recreation | 501 to 1000 employees | 37 | 66 | 51 | New York | NY | 500 |
| 7 | GNY Insurance Companies | Data Science Analyst\n\nJob Details\nLevel\nEx... | Data Analyst | New York, NY | 3 | Insurance | 201 to 500 employees | 37 | 66 | 51 | New York | NY | 500 |
#get datatypes of each column (verify the casts above took effect)
df.dtypes
Company Name object Job Description object Job Title category Location object Rating int64 Sector category Size object salary lower bound int64 salary upper bound int64 avg salary int64 City object State object Max revenue int64 dtype: object
#save dataframe to csv
# NOTE(review): this also writes the index as an extra unnamed column; pass
# index=False if the file is meant to be re-read directly.
df.to_csv('data_cleaned.csv')
# per-role views of the cleaned data (reused by the later charts)
data_analyst = df[df['Job Title'] == 'Data Analyst']
data_scientist = df[df['Job Title'] == 'Data Scientist']
data_engineer = df[df['Job Title'] == 'Data Engineer']
# distribution of the per-posting average salary, split by role, with
# box-plot marginals on top
fig = px.histogram(df, x='avg salary', color='Job Title',
                   title='Average Salary of Data Jobs', marginal="box",
                   hover_data=df[['Job Title', 'Company Name']], opacity=0.8)
fig.show()
# mean of the per-posting average salary, by role
print(np.average(data_analyst['avg salary']))
print(np.average(data_scientist['avg salary']))
print(np.average(data_engineer['avg salary']))
71.15886524822695 107.29401993355482 99.90269930947898
def get_coordinates(loc_name, max_retries=3):
    """Geocode a location string to (latitude, longitude) via Nominatim.

    ' US' is appended to bias results toward the United States.  FIX: the
    notebook imports GeocoderTimedOut but never handled it -- Nominatim
    regularly times out on long batches, so retry up to *max_retries* times
    and re-raise only if every attempt fails.
    """
    geolocator = Nominatim(user_agent="test user agent")
    for attempt in range(max_retries):
        try:
            location = geolocator.geocode(loc_name + " US")
            break
        except GeocoderTimedOut:
            # transient server-side timeout: retry, then give up loudly
            if attempt == max_retries - 1:
                raise
    return (location.latitude, location.longitude)
# Geocode every unique Data Analyst job location once (network-bound, hence
# the tqdm progress bar) and cache the coordinates to CSV.
row_number=0
geo_locations=pd.DataFrame()
for addr in tqdm(data_analyst.Location.unique()):
    geo_locations.loc[row_number,'Location']=addr
    coord=get_coordinates(addr)
    geo_locations.loc[row_number,'Lat']=coord[0]
    # NOTE(review): 'Lot' is presumably a typo for 'Lon'(gitude); the map code
    # below uses the same key, so it is kept for consistency.
    geo_locations.loc[row_number,'Lot']=coord[1]
    row_number +=1
geo_locations.to_csv('Data_Analyst_Jobs_coordinates.csv', index=False)
# count postings per location and join with the geocoded coordinates
df_count=df.Location.value_counts().rename_axis('Location').reset_index(name='counts')
job_locations=pd.merge(df_count, geo_locations, on="Location")
# Bubble map: one marker per location, sized by posting count.
# FIX: a later update_layout(mapbox_style="stamen-terrain") silently overrode
# the style passed here; Stamen tiles now require a Stadia Maps API key, so
# keep the single free "carto-positron" basemap instead.
fig = px.scatter_mapbox(job_locations, lat="Lat", lon="Lot", size="counts",
                        hover_name="Location",
                        title='Interactive Job Location Density Map',
                        color_continuous_scale=px.colors.cyclical.IceFire,
                        size_max=15, zoom=3,
                        color_discrete_sequence=["crimson"],
                        mapbox_style="carto-positron")
fig.show()
# Count postings per (state, job title) pair for the treemap.
state_jobs = df.groupby(['State', 'Job Title'])['Job Title'].count().to_frame('Count').reset_index()
# single root node so the treemap has a common parent
state_jobs['Country'] = 'US'
state_jobs = state_jobs.sort_values('Count', ascending=False)
# Draw the treemap: US -> state -> job title, sized and colored by count.
fig = px.treemap(state_jobs, path=['Country', 'State', 'Job Title'], values='Count',
                 color='Count',
                 color_continuous_scale='Blues', title='Job No. by State')
fig.data[0].textinfo = 'label+text+value+percent parent'
fig.show()
For data scientists:
- Texas, California, and Illinois are the states with the most data scientist job openings.
For data engineers:
- Texas, California, and Arizona are the states with the most data engineer job openings.
For data analysts:
- California, Texas, and New York are the states with the most data analyst job openings.
# Mean salary bounds per state, one frame per role, sorted so the
# highest-paying states come first.
data1=data_analyst.groupby(['State'])[['salary upper bound','salary lower bound']].mean().sort_values(['salary upper bound','salary lower bound'],ascending=False)
data2=data_scientist.groupby(['State'])[['salary upper bound','salary lower bound']].mean().sort_values(['salary upper bound','salary lower bound'],ascending=False)
data3=data_engineer.groupby(['State'])[['salary upper bound','salary lower bound']].mean().sort_values(['salary upper bound','salary lower bound'],ascending=False)
#put dataframes into a dict for easy access:
dataset_dict = {'Data Analyst': data1,'Data Scientist': data2,'Data Engineer': data3}
#create a figure from the graph objects (not plotly express) library
fig = go.Figure()
buttons = []
i = 0
n_cols = len(data1.columns)
#iterate through dataframes in dict: two bar traces per job title, plus one
#dropdown button per job that toggles its traces visible
for job, dataset in dataset_dict.items():
    #iterate through the salary-bound columns of the dataframe
    for column in dataset:
        #add a bar trace to the figure for the column we are on
        if column == 'salary upper bound':
            fig.add_trace(go.Bar(
                name = column,
                #x axis is the state codes (the groupby index)
                x = dataset.index,
                #y axis is the data for the column we are on
                y = dataset[column].to_list(),
                #only the first job's traces are visible by default
                visible = (i==0), marker_color= 'rgb(26, 118, 255)'
                )
            )
        else :
            fig.add_trace(go.Bar(
                name = column,
                #x axis is the state codes (the groupby index)
                x = dataset.index,
                #y axis is the data for the column we are on
                y = dataset[column].to_list(),
                #only the first job's traces are visible by default
                visible = (i==0), marker_color='rgb(55, 83, 109)'
                )
            )
    #args is a list of booleans that tells the button which traces to show
    #on click: one flag per trace (len(dataset_dict) jobs x n_cols traces)
    args = [False] * len(dataset_dict)*(n_cols)
    args[i*n_cols:(i+1)*n_cols] = [True]*n_cols
    #create a button object for the job we are on
    button = dict(label = job,
                  method = "update",
                  args=[{"visible": args}])
    #add the button to our list of buttons
    buttons.append(button)
    #i counts jobs and tells the "args" slice above which traces to enable
    i+=1
fig.update_layout(
    updatemenus=[
        dict(
            #change this to "buttons" for individual buttons
            type="dropdown",
            #this can be "left" or "right" as you like
            direction="down",
            #(1,1) refers to the top right corner of the plot
            x = 1,
            y = 1,
            #the list of buttons we created earlier
            buttons = buttons)
    ],
    #stacked bar chart specified here
    barmode = "stack",
    #dtick=1 forces a tick label for every state on the x axis
    xaxis = dict(dtick = 1))
For data analysts:
- California and Arizona have the highest maximum average salaries among the states.

For data scientists:
- Delaware, California, and New York have the highest maximum average salaries among the states.

For data engineers:
- California, Delaware, and Pennsylvania have the highest maximum average salaries among the states.
# Number of job postings per (company size, job title) pair, most common first.
df_size = (df.groupby(["Size", 'Job Title'])["Company Name"]
             .count()
             .sort_values(ascending=False))
size_labels = df_size.index.get_level_values(0)
title_labels = df_size.index.get_level_values(1)
# Horizontal bar chart, animated across the three job titles.
fig = px.bar(y=size_labels,
             x=df_size.values,
             orientation='h',
             color_discrete_sequence=['rgb(47,56,84)'] * len(df_size),
             opacity=0.7,
             text=df_size.values,
             animation_frame=title_labels)
fig.update_traces(marker_line_color='rgb(142,250,273)',
                  marker_line_width=1.5,
                  texttemplate='%{text:.2s}')
fig.update_layout(width=1000,
                  showlegend=False,
                  xaxis_title="No. of companies",
                  yaxis_title="Company Size",
                  title="Company Size and their Company count", title_x=0.5)
fig.show()
For data analysts:
- Companies with 10000+ employees post the most data analyst job openings (51).

For data scientists:
- Companies in the 510+ size bracket post the most data scientist job openings (51).

For data engineers:
- Companies with 10000+ employees post the most data engineer job openings (51).
# Job-posting counts per (company size, job title, company) combination.
company_size = df.groupby(["Size", 'Job Title'])["Company Name"].value_counts()
company_size_df = (pd.DataFrame(company_size)
                   .rename(columns={"Company Name": "count"})
                   .reset_index())
# A single root node so the treemap has one top-level rectangle.
company_size_df['Country'] = 'US'
# Companies advertising only one job would clutter the treemap — drop them.
single_posting = company_size_df['count'] == 1
company_size_df.drop(company_size_df.loc[single_posting].index, inplace=True)
fig = px.treemap(company_size_df, path=['Country', 'Size', 'Company Name', 'Job Title'],
                 values='count', color='count',
                 color_continuous_scale='Cividis',
                 title=('Companies with their Size and Job Counts'))
fig.update_layout(title_x=0.5)
fig.data[0].textinfo = 'label+text+percent parent'
fig.show()
# Number of job postings per (company rating, job title) pair, most common first.
df_rate = df.groupby(["Rating", 'Job Title'])["Company Name"].count()
df_rate = df_rate.sort_values(ascending=False)
fig = px.bar(x=df_rate.index.get_level_values(0),
             y=df_rate.values,
             text=df_rate.values,
             color_discrete_sequence=['rgb(158,202,225)'] * len(df_rate),
             animation_frame=df_rate.index.get_level_values(1), opacity=0.6)
fig.update_traces(texttemplate='%{text:.2s}',
                  marker_line_color='rgb(8,48,107)',
                  marker_line_width=1.5)
# FIX: the axis titles were swapped — the x axis carries the company rating
# and the y axis carries the posting count.
fig.update_layout(width=1000,
                  showlegend=False,
                  xaxis_title="Company Rating",
                  yaxis_title="No. of companies",
                  title="Company Rates and their Company count", title_x=0.5)
fig.show()
For data scientists:
- Companies rated between 3 and 4 account for most data scientist job openings.

For data engineers:
- Companies rated between 3 and 4 account for most data engineer job openings.

For data analysts:
- Companies rated between 3 and 4 account for most data analyst job openings.
# Number of job postings per (sector, job title) pair, ascending so the
# largest sector lands at the top of the horizontal bar chart.
df_sector = (df.groupby(["Sector", 'Job Title'])["Company Name"]
               .count()
               .sort_values(ascending=True))
sector_labels = df_sector.index.get_level_values(0)
title_labels = df_sector.index.get_level_values(1)
fig = px.bar(y=sector_labels,
             x=df_sector.values,
             color_discrete_sequence=['rgb(47,56,84)'] * len(df_sector),
             opacity=0.6,
             animation_frame=title_labels)
fig.update_traces(marker_line_color='rgb(142,250,273)',
                  marker_line_width=1.5)
fig.update_layout(showlegend=False,
                  height=800,
                  xaxis_title="Number of job openings ",
                  yaxis_title="Sectors",
                  title="Number of job openings by Sectors", title_x=0.5)
fig.show()
For data analysts:
- Information Technology
- Business Services
- Finance
- Health Care
are the sectors advertising the most data analyst job openings.

For data scientists:
- Business Services
- Biotech & Pharmaceuticals
- Finance
- Health Care
- Insurance
are the sectors advertising the most data scientist job openings.

For data engineers:
- Information Technology
- Business Services
- Finance
- Aerospace & Defense
- Manufacturing
are the sectors advertising the most data engineer job openings.
def get_keywords_languages_DA(df_keywords_list):
    """Share (%) of listings mentioning each programming-language keyword.

    Parameters
    ----------
    df_keywords_list : list[str]
        Job-description texts; assumed already lower-cased by the caller.

    Returns
    -------
    pd.DataFrame
        Columns 'Languages' and 'Count' (percentage of all keyword hits),
        sorted descending by Count.
    """
    # FIX: the list previously contained both "C++" and 'c++'; matching is
    # case-insensitive, so the duplicate produced two identical rows and
    # inflated the percentage denominator. Deduplicated here.
    languages = ["Python", "C++", "MATLAB", 'R', ".NET", "C#", "JavaScript",
                 "HTML", "Java", "Scala", "SQL"]
    languages_freq = {}
    for item in languages:
        # Case-insensitive substring match against each description.
        languages_freq[item] = sum(item.lower() in it for it in df_keywords_list)
    df_languages = pd.DataFrame(list(languages_freq.items()),
                                columns=['Languages', 'Count'])
    total = df_languages['Count'].sum()
    # Guard against an empty corpus (0/0 would yield NaN percentages).
    if total:
        df_languages['Count'] = df_languages['Count'] / total * 100
    return df_languages.sort_values(['Count'], ascending=False)
def plot_programming_languages_DA(position):
    """Bar chart of programming-language keyword frequency for `position` listings."""
    # Lower-case the matching job descriptions so keyword matching is case-insensitive.
    mask = df['Job Title'].str.lower().str.contains(position.lower())
    descriptions = [text.lower() for text in df.loc[mask, "Job Description"].tolist()]
    top_languages = get_keywords_languages_DA(descriptions).head(10)
    fig = px.bar(data_frame=top_languages,
                 x="Languages",
                 y="Count",
                 color_discrete_sequence=['rgb(47,56,84)'] * len(descriptions))
    fig.update_layout(
        showlegend=False,
        xaxis_title="Keywords related to (Programming) Languages",
        yaxis_title=f"% of {position} listings",
        title="Programming Languages Skills", title_x=0.5)
    return fig.show()
plot_programming_languages_DA(position='Data Analyst')
def get_keywords_tools_DA(df_keywords_list):
    """Share (%) of listings mentioning each tool/technology keyword.

    Expects lower-cased job-description texts; returns a DataFrame with
    'Tools' and 'Count' (percentage) columns, sorted descending.
    """
    tools = ['Excel', "Hadoop", 'Power Bi', 'SAS', 'SAP', 'Tableau', "Spark",
             "Impala", "Cassandra", "Kafka", "HDFS", "Hive", "Kubeflow",
             "Airflow", "BigQuery"]
    # Case-insensitive substring match; sum of booleans counts the hits.
    freq = {tool: sum(tool.lower() in text for text in df_keywords_list)
            for tool in tools}
    df_big_data = pd.DataFrame(list(freq.items()), columns=['Tools', 'Count'])
    # Express each count as a share of all keyword hits.
    df_big_data['Count'] = df_big_data['Count'] / df_big_data['Count'].sum() * 100
    return df_big_data.sort_values('Count', ascending=False)
def plot_tools_and_tech_DA(position):
    """Bar chart of tool/technology keyword frequency for `position` listings."""
    mask = df['Job Title'].str.lower().str.contains(position.lower())
    descriptions = [text.lower() for text in df.loc[mask, "Job Description"].tolist()]
    top_tools = get_keywords_tools_DA(descriptions).head(10)
    fig = px.bar(data_frame=top_tools,
                 x="Tools",
                 y="Count",
                 color_discrete_sequence=['rgb(47,56,84)'] * len(descriptions))
    fig.update_layout(width=1000,
                      showlegend=False,
                      xaxis_title="Keywords related to Tools and Technologies",
                      yaxis_title=f"% of {position} listings",
                      title="Tools and Technologies", title_x=0.5)
    return fig.show()
plot_tools_and_tech_DA(position='Data Analyst')
def get_keywords_cloud(df_keywords_list):
    """Share (%) of listings mentioning each cloud-provider keyword.

    Expects lower-cased job-description texts; returns a DataFrame with
    'Cloud' and 'Count' (percentage) columns, sorted descending.
    """
    cloud = ["AWS", "Azure", "Google Cloud"]
    freq = {provider: sum(provider.lower() in text for text in df_keywords_list)
            for provider in cloud}
    df_cloud = pd.DataFrame(list(freq.items()), columns=['Cloud', 'Count'])
    # Express each count as a share of all keyword hits.
    df_cloud['Count'] = df_cloud['Count'] / df_cloud['Count'].sum() * 100
    return df_cloud.sort_values('Count', ascending=False)
def plot_cloud(position):
    """Bar chart of cloud-provider keyword frequency for `position` listings."""
    mask = df['Job Title'].str.lower().str.contains(position.lower())
    descriptions = [text.lower() for text in df.loc[mask, "Job Description"].tolist()]
    top_cloud = get_keywords_cloud(descriptions).head(10)
    fig = px.bar(data_frame=top_cloud,
                 x="Cloud",
                 y="Count",
                 color_discrete_sequence=['rgb(47,56,84)'] * len(descriptions))
    fig.update_layout(width=1000,
                      showlegend=False,
                      xaxis_title="Keywords related to Cloud Computing and Storage",
                      yaxis_title=f"% of {position} listings",
                      title="Cloud Computing and Storage", title_x=0.5)
    return fig.show()
plot_cloud(position='Data Analyst')
def get_keywords_edu(df_keywords_list):
    """Share (%) of listings mentioning each education-level keyword.

    Expects lower-cased job-description texts; returns a DataFrame with
    'Education' and 'Count' (percentage) columns, sorted descending.
    """
    exp_edu = ["PhD", "Master", "Bachelor"]
    freq = {level: sum(level.lower() in text for text in df_keywords_list)
            for level in exp_edu}
    df_exp_edu = pd.DataFrame(list(freq.items()), columns=['Education', 'Count'])
    # Express each count as a share of all keyword hits.
    df_exp_edu['Count'] = df_exp_edu['Count'] / df_exp_edu['Count'].sum() * 100
    return df_exp_edu.sort_values('Count', ascending=False)
def plot_edu(position):
    """Bar chart of education-level keyword frequency for `position` listings."""
    mask = df['Job Title'].str.lower().str.contains(position.lower())
    descriptions = [text.lower() for text in df.loc[mask, "Job Description"].tolist()]
    top_edu = get_keywords_edu(descriptions).head(10)
    fig = px.bar(data_frame=top_edu,
                 x="Education",
                 y="Count",
                 color_discrete_sequence=['rgb(47,56,84)'] * len(descriptions))
    fig.update_layout(width=1000,
                      showlegend=False,
                      xaxis_title="Keywords related to Education",
                      yaxis_title=f"% of {position} listings",
                      title="Education", title_x=0.5)
    return fig.show()
plot_edu(position='Data Analyst')
def get_keywords_languages_DS(df_keywords_list):
    """Share (%) of listings mentioning each data-science language keyword.

    Expects lower-cased job-description texts; returns a DataFrame with
    'Languages' and 'Count' (percentage) columns, sorted descending.
    """
    languages = ["Python", "C++", "MATLAB", 'R', "C#", "JavaScript", "Java",
                 "Scala", "SQL"]
    freq = {lang: sum(lang.lower() in text for text in df_keywords_list)
            for lang in languages}
    df_languages = pd.DataFrame(list(freq.items()), columns=['Languages', 'Count'])
    # Express each count as a share of all keyword hits.
    df_languages['Count'] = df_languages['Count'] / df_languages['Count'].sum() * 100
    return df_languages.sort_values('Count', ascending=False)
def plot_programming_languages_DS(position):
    """Bar chart of programming-language keyword frequency for `position` listings."""
    mask = df['Job Title'].str.lower().str.contains(position.lower())
    descriptions = [text.lower() for text in df.loc[mask, "Job Description"].tolist()]
    top_languages = get_keywords_languages_DS(descriptions).head(10)
    fig = px.bar(data_frame=top_languages,
                 x="Languages",
                 y="Count",
                 color_discrete_sequence=['rgb(47,56,84)'] * len(descriptions))
    fig.update_layout(
        showlegend=False,
        xaxis_title="Keywords related to (Programming) Languages",
        yaxis_title=f"% of {position} listings",
        title="Programming Languages Skills", title_x=0.5)
    return fig.show()
plot_programming_languages_DS(position='Data Scientist')
def get_keywords_tools_DS(df_keywords_list):
    """Share (%) of listings mentioning each big-data tool keyword.

    Expects lower-cased job-description texts; returns a DataFrame with
    'Tools' and 'Count' (percentage) columns, sorted descending.
    """
    big_data = ["Hadoop", 'Power Bi', 'SAS', 'SAP', 'Tableau', "Spark",
                "Impala", 'HBase', "Cassandra", "Kafka", "HDFS", "Hive",
                "Kubeflow", "Airflow", "BigQuery"]
    freq = {tool: sum(tool.lower() in text for text in df_keywords_list)
            for tool in big_data}
    df_big_data = pd.DataFrame(list(freq.items()), columns=['Tools', 'Count'])
    # Express each count as a share of all keyword hits.
    df_big_data['Count'] = df_big_data['Count'] / df_big_data['Count'].sum() * 100
    return df_big_data.sort_values('Count', ascending=False)
def plot_tools_and_tech_DS(position):
    """Bar chart of big-data tool keyword frequency for `position` listings."""
    mask = df['Job Title'].str.lower().str.contains(position.lower())
    descriptions = [text.lower() for text in df.loc[mask, "Job Description"].tolist()]
    top_tools = get_keywords_tools_DS(descriptions).head(10)
    fig = px.bar(data_frame=top_tools,
                 x="Tools",
                 y="Count",
                 color_discrete_sequence=['rgb(47,56,84)'] * len(descriptions))
    fig.update_layout(width=1000,
                      showlegend=False,
                      xaxis_title="Keywords related to Tools and Technologies",
                      yaxis_title=f"% of {position} listings",
                      title="Tools and Technologies", title_x=0.5)
    return fig.show()
plot_tools_and_tech_DS(position='Data Scientist')
plot_cloud(position='Data Scientist')
plot_edu(position='Data Scientist')
def get_keywords_languages_DE(df_keywords_list):
    """Share (%) of listings mentioning each data-engineering language keyword.

    Expects lower-cased job-description texts; returns a DataFrame with
    'Languages' and 'Count' (percentage) columns, sorted descending.
    """
    languages = ["Python", "C++", 'R', "C#", "JavaScript", 'Php', 'Bash',
                 'Julia', "Java", "Scala", "SQL"]
    freq = {lang: sum(lang.lower() in text for text in df_keywords_list)
            for lang in languages}
    df_languages = pd.DataFrame(list(freq.items()), columns=['Languages', 'Count'])
    # Express each count as a share of all keyword hits.
    df_languages['Count'] = df_languages['Count'] / df_languages['Count'].sum() * 100
    return df_languages.sort_values('Count', ascending=False)
def plot_programming_languages_DE(position):
    """Bar chart of programming-language keyword frequency for `position` listings."""
    mask = df['Job Title'].str.lower().str.contains(position.lower())
    descriptions = [text.lower() for text in df.loc[mask, "Job Description"].tolist()]
    top_languages = get_keywords_languages_DE(descriptions).head(10)
    fig = px.bar(data_frame=top_languages,
                 x="Languages",
                 y="Count",
                 color_discrete_sequence=['rgb(47,56,84)'] * len(descriptions))
    fig.update_layout(
        showlegend=False,
        xaxis_title="Keywords related to (Programming) Languages",
        yaxis_title=f"% of {position} listings",
        title="Programming Languages Skills", title_x=0.5)
    return fig.show()
plot_programming_languages_DE(position='Data Engineer')
plot_tools_and_tech_DS(position='Data Engineer')
plot_cloud(position='Data Engineer')
plot_edu(position='Data Engineer')